Biden

# Load the Biden feeling-thermometer survey data.
targetFile <- "data/biden.csv"
data <- read.csv(targetFile)

# Mean squared error of a fitted model evaluated on a data set.
#
# Args:
#   model: a fitted model (e.g. lm/glm) supporting predict() and formula().
#   data:  a data frame, or a modelr resample object, containing the model's
#          variables (as.data.frame() handles both).
# Returns: the mean of the squared residuals, with NAs removed.
mse <- function(model, data) {
    # Coerce modelr resample objects (and plain data frames) uniformly.
    df <- as.data.frame(data)
    # Compute residuals explicitly instead of calling the unexported
    # modelr:::residuals helper — ":::" access to internals is fragile
    # across package versions.
    response <- all.vars(formula(model))[[1]]
    res <- df[[response]] - stats::predict(model, newdata = df)
    mean(res^2, na.rm = TRUE)
}


# Fit Biden warmth on demographics and party ID; compute in-sample MSE
# on the full data set. (Printed summary below must match this code.)
lin_Biden <- lm(biden ~ age + female + educ + dem + rep, data = data)
lin_mse <- mse(lin_Biden, data)
summary(lin_Biden)
## 
## Call:
## lm(formula = biden ~ age + female + educ + dem + rep, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -75.55 -11.29   1.02  12.78  53.98 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  58.8113     3.1244   18.82  < 2e-16 ***
## age           0.0483     0.0282    1.71    0.088 .  
## female        4.1032     0.9482    4.33  1.6e-05 ***
## educ         -0.3453     0.1948   -1.77    0.076 .  
## dem          15.4243     1.0680   14.44  < 2e-16 ***
## rep         -15.8495     1.3114  -12.09  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.9 on 1801 degrees of freedom
## Multiple R-squared:  0.282,  Adjusted R-squared:  0.28 
## F-statistic:  141 on 5 and 1801 DF,  p-value: <2e-16

The MSE for the whole set is 395.27.

# 70/30 train/test split: fit on the training set, evaluate on the test set.
data_split <- resample_partition(data, c(test = 0.3, train = 0.7))
lin_Biden_train <- lm(biden ~ age + female + educ + dem + rep, data = data_split$train)
lin_mse_val <- mse(lin_Biden_train, data_split$test)

The MSE of the model fitted on the training set, evaluated on the test set, is 401.628. This is higher than when fitted against the whole set, suggesting the whole-set fit led to overfitting.

# Held-out MSE for a single random split with i% of rows in the test set.
#
# Args:
#   i:  test-set percentage (1-100).
#   df: data frame to split; defaults to the global `data` so existing
#       callers are unaffected.
# Returns: MSE of the linear Biden model on the held-out portion.
mseCal <- function(i, df = data) {
    # 100 different splits
    if (i == 30) {
        # NOTE(review): 30 is nudged to 30.1 — presumably a workaround for a
        # resample_partition failure at exactly 0.30; confirm before removing.
        i <- 30.1
    }
    r <- i/100
    split <- resample_partition(df, c(test = r, train = 1 - r))
    lin_model <- lm(biden ~ age + female + educ + dem + rep, data = split$train)
    mse(lin_model, split$test)
}

# Sweep the test-set percentage from 1% to 100% and record each MSE.
set.seed(1234)
df <- data.frame(index = seq_len(100))
df$mse <- vapply(df$index, mseCal, numeric(1))


# Held-out MSE as a function of the test-set percentage.
ggplot(data = df, aes(x = index, mse)) + geom_smooth() + geom_point() + labs(title = "MSE vs Percentage of elements in the testing set", 
    x = "Percentage of elements in the testing set", y = "MSE")

We can see from the plot that most partitions result in an MSE of around 400, but both high and low test-set percentages result in higher MSE, due to overfitting and underfitting respectively. Although \(100\%\) inclusion in the training set results in a lower MSE, as we saw in section 1.

# Leave-one-out CV: k = n folds, one model per held-out observation.
loocv_data <- crossv_kfold(data, k = nrow(data))
loocv_models <- map(loocv_data$train, ~lm(biden ~ age + female + educ + dem + 
    rep, data = .))
loocv_mse <- map2_dbl(loocv_models, loocv_data$test, mse)
loocv_mean_mse <- mean(loocv_mse)

The mean MSE under leave-one-out is 397.956. This is about what we saw in the center of the 100 splits and seems to be the lowest we can get without over fitting.

# Mean held-out MSE of one 10-fold CV replicate of the Biden linear model.
#
# Args:
#   i: iteration index; unused except to drive replication via lapply().
# Returns: mean MSE across the 10 folds.
mseFoldCal <- function(i) {
    tenFold_data <- crossv_kfold(data, k = 10)
    tenFold_models <- map(tenFold_data$train, ~lm(biden ~ age + female + educ + 
        dem + rep, data = .))
    tenFold_mse <- map2_dbl(tenFold_models, tenFold_data$test, mse)
    # Return the mean directly; the original assigned it to a throwaway
    # variable and relied on the assignment's invisible value as the return.
    mean(tenFold_mse)
}


# Repeat 10-fold CV 100 times to examine the spread of the CV estimate.
set.seed(1234)
ten_fold_df <- data.frame(index = 1:100)
ten_fold_df$mse <- unlist(lapply(ten_fold_df$index, mseFoldCal))

The mean MSE under 10-fold CV is 397.884. This is about what we saw in the center of the 100 splits, and very similar to the mean of the LOOCV, it still seems to be the lowest we can get without over fitting.

# Distribution of the 100 replicate 10-fold CV estimates.
qplot(mse, data = ten_fold_df, geom = "histogram", main = "Histogram of model MSE under 10-fold CV", 
    binwidth = 0.1, xlab = "MSE", ylab = "Count")

The histogram of 10-fold MSEs shows that the MSE is always very close to 398 under different iterations of 10-fold CV. This is very similar to the LOOCV result, which was likewise stable under most splits.

# Bootstrap: refit the model on 1000 resamples and tidy the coefficients.
biden_boot <- data %>% modelr::bootstrap(1000) %>% mutate(model = map(strap, 
    ~lm(biden ~ age + female + educ + dem + rep, data = .)), coef = map(model, 
    tidy))

# Bootstrap point estimates and standard errors for each coefficient.
biden_boot %>% unnest(coef) %>% group_by(term) %>% summarize(est.boot = mean(estimate), 
    se.boot = sd(estimate, na.rm = TRUE))
## # A tibble: 6 × 3
##          term est.boot se.boot
##         <chr>    <dbl>   <dbl>
## 1 (Intercept)   58.937  2.9781
## 2         age    0.048  0.0288
## 3         dem   15.438  1.1129
## 4        educ   -0.353  0.1912
## 5      female    4.103  0.9544
## 6         rep  -15.868  1.4452
# Compare the bootstrap SEs against the parametric OLS standard errors.
summary(lin_Biden)
## 
## Call:
## lm(formula = biden ~ age + female + educ + dem + rep, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -75.55 -11.29   1.02  12.78  53.98 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  58.8113     3.1244   18.82  < 2e-16 ***
## age           0.0483     0.0282    1.71    0.088 .  
## female        4.1032     0.9482    4.33  1.6e-05 ***
## educ         -0.3453     0.1948   -1.77    0.076 .  
## dem          15.4243     1.0680   14.44  < 2e-16 ***
## rep         -15.8495     1.3114  -12.09  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.9 on 1801 degrees of freedom
## Multiple R-squared:  0.282,  Adjusted R-squared:  0.28 
## F-statistic:  141 on 5 and 1801 DF,  p-value: <2e-16

These are very similar estimated values to those provided by the initial fit (e.g. age differs by only \(2.5\%\)), but the standard errors for the bootstrap are larger.

College

# Load the college data and regress out-of-state tuition on all predictors.
targetFile <- "data/college.csv"
data <- read.csv(targetFile)
lin_coll <- lm(Outstate ~ ., data = data)
summary(lin_coll)
## 
## Call:
## lm(formula = Outstate ~ ., data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -6783  -1267    -41   1244   9953 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.59e+03   7.66e+02   -2.07   0.0386 *  
## PrivateYes   2.26e+03   2.48e+02    9.13  < 2e-16 ***
## Apps        -3.03e-01   6.73e-02   -4.51  7.6e-06 ***
## Accept       8.12e-01   1.29e-01    6.29  5.5e-10 ***
## Enroll      -5.49e-01   3.54e-01   -1.55   0.1213    
## Top10perc    2.83e+01   1.10e+01    2.58   0.0100 *  
## Top25perc   -3.78e+00   8.47e+00   -0.45   0.6558    
## F.Undergrad -9.57e-02   6.15e-02   -1.55   0.1204    
## P.Undergrad  1.17e-02   6.05e-02    0.19   0.8472    
## Room.Board   8.82e-01   8.56e-02   10.30  < 2e-16 ***
## Books       -4.59e-01   4.48e-01   -1.03   0.3055    
## Personal    -2.29e-01   1.18e-01   -1.94   0.0528 .  
## PhD          1.12e+01   8.73e+00    1.29   0.1982    
## Terminal     2.47e+01   9.54e+00    2.59   0.0099 ** 
## S.F.Ratio   -4.64e+01   2.44e+01   -1.90   0.0575 .  
## perc.alumni  4.18e+01   7.56e+00    5.53  4.5e-08 ***
## Expend       1.99e-01   2.27e-02    8.77  < 2e-16 ***
## Grad.Rate    2.40e+01   5.51e+00    4.36  1.5e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1960 on 759 degrees of freedom
## Multiple R-squared:  0.768,  Adjusted R-squared:  0.763 
## F-statistic:  148 on 17 and 759 DF,  p-value: <2e-16

Doing a linear fit across all the variables lets us pick a few variables that are likely to be significant. Let's look at: Expend, Room.Board and Apps.

# Simple regression of out-of-state tuition on instructional expenditure.
lin_Expend <- lm(Outstate ~ Expend, data = data)
summary(lin_Expend)
## 
## Call:
## lm(formula = Outstate ~ Expend, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -15781  -2089     58   2011   7785 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.43e+03   2.25e+02    24.2   <2e-16 ***
## Expend      5.18e-01   2.05e-02    25.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2980 on 775 degrees of freedom
## Multiple R-squared:  0.453,  Adjusted R-squared:  0.452 
## F-statistic:  641 on 1 and 775 DF,  p-value: <2e-16
# Simple regression of out-of-state tuition on room and board costs.
lin_Room.Board <- lm(Outstate ~ Room.Board, data = data)
summary(lin_Room.Board)
## 
## Call:
## lm(formula = Outstate ~ Room.Board, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8781  -2071   -351   1877  11877 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -17.4453   447.7679   -0.04     0.97    
## Room.Board    2.4000     0.0997   24.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3040 on 775 degrees of freedom
## Multiple R-squared:  0.428,  Adjusted R-squared:  0.427 
## F-statistic:  580 on 1 and 775 DF,  p-value: <2e-16
# Simple regression of out-of-state tuition on number of applications.
lin_Apps <- lm(Outstate ~ Apps, data = data)
summary(lin_Apps)
## 
## Call:
## lm(formula = Outstate ~ Apps, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8328  -3176   -402   2526  11389 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.03e+04   1.83e+02    56.3   <2e-16 ***
## Apps        5.21e-02   3.73e-02     1.4     0.16    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4020 on 775 degrees of freedom
## Multiple R-squared:  0.00252,    Adjusted R-squared:  0.00123 
## F-statistic: 1.95 on 1 and 775 DF,  p-value: 0.162

Looking at the summaries of the models shows that the \(R^2\) values are very low (around \(.4\)) and that the Apps p-value has increased significantly (its \(R^2\) is also low). The best fit, by \(R^2\) is for expend. So lets plot that:

# Residuals vs. fitted values for the linear Expend model.
expendDF <- add_predictions(data, lin_Expend)
expendDF <- add_residuals(expendDF, lin_Expend)

ggplot(expendDF, aes(x = pred, y = resid)) + geom_smooth() + geom_point() + 
    labs(title = "Linear model regression for Expend", x = "Predicted expenditure", 
        y = "Residuals")

The plot shows that the model is mostly accurate for expenditures around \(10{,}000\) USD but gets worse as the expenditure increases. This is likely due to a non-linear term becoming dominant. So let's try a polynomial fit; to find the degree we will use 10-fold CV.

# Shared 10-fold partition so all polynomial degrees see the same folds.
set.seed(1234)
tenFold_data <- crossv_kfold(data, k = 10)

# Mean 10-fold CV MSE for a degree-d polynomial of Expend.
#
# Args:
#   d: polynomial degree passed to poly().
# Returns: mean held-out MSE across the 10 shared folds (tenFold_data).
polyMSE <- function(d) {
    tenFold_models <- map(tenFold_data$train, ~lm(Outstate ~ poly(Expend, d), 
        data = .))
    tenFold_mse <- map2_dbl(tenFold_models, tenFold_data$test, mse)
    # Return the mean directly instead of via a throwaway assignment.
    mean(tenFold_mse)
}

# Evaluate degrees 1 through 10.
tenFoldDF <- data.frame(index = 1:10)
tenFoldDF$mse <- vapply(1:10, polyMSE, numeric(1))

# CV MSE by polynomial degree (log scale on the y axis).
ggplot(tenFoldDF, aes(index, mse)) + geom_line() + geom_point() + scale_y_log10() + 
    labs(title = "MSE vs polynomial fit degree for Expend", x = "Degree", y = "MSE")

We can see that the lowest MSE is obtained for a polynomial of degree 3. So let's see what that model leads to.

# Refit the chosen cubic polynomial on the full data.
poly3_Expend <- lm(Outstate ~ poly(Expend, 3), data = data)
summary(poly3_Expend)
## 
## Call:
## lm(formula = Outstate ~ poly(Expend, 3), data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -11428  -1513    200   1722   5932 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       10440.7       91.1  114.59  < 2e-16 ***
## poly(Expend, 3)1  75397.1     2539.8   29.69  < 2e-16 ***
## poly(Expend, 3)2 -41623.0     2539.8  -16.39  < 2e-16 ***
## poly(Expend, 3)3  12483.0     2539.8    4.91  1.1e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2540 on 773 degrees of freedom
## Multiple R-squared:  0.603,  Adjusted R-squared:  0.601 
## F-statistic:  391 on 3 and 773 DF,  p-value: <2e-16

An \(R^2\) higher than \(.5\) — that's a much better fit. Let's plot it:

# Residuals vs. fitted values for the cubic polynomial fit.
expendDF <- add_predictions(data, poly3_Expend)
expendDF <- add_residuals(expendDF, poly3_Expend)

ggplot(expendDF, aes(x = pred, y = resid)) + geom_smooth() + geom_point() + 
    labs(title = "3rd order polynomial model regression for Expend", x = "Predicted expenditure", 
        y = "Residuals")

This model has none of the edge effects of the linear fit and is a much better fit for the data all around. Since that went well, let's try a nonlinear model for the worst fit, Apps, but this time a spline. First we will find the optimal number of knots with 3rd order piecewise polynomials:

# Shared 10-fold partition so all spline complexities see the same folds.
set.seed(1234)
tenFold_data <- crossv_kfold(data, k = 10)

# Mean 10-fold CV MSE for a cubic B-spline of Apps with n degrees of freedom.
#
# Renamed from polyMSE: this helper fits splines, not polynomials, and the
# old name shadowed the earlier polynomial helper.
#
# Args:
#   n: degrees of freedom passed to bs() (controls the number of knots).
# Returns: mean held-out MSE across the 10 shared folds (tenFold_data).
splineMSE <- function(n) {
    tenFold_models <- map(tenFold_data$train, ~glm(Outstate ~ bs(Apps, df = n), 
        data = .))
    tenFold_mse <- map2_dbl(tenFold_models, tenFold_data$test, mse)
    # Return the mean directly instead of via a throwaway assignment.
    mean(tenFold_mse)
}

# Evaluate df = 1 through 10.
tenFoldDF <- data.frame(index = 1:10)
tenFoldDF$mse <- vapply(1:10, splineMSE, numeric(1))

# CV MSE by spline degrees of freedom.
ggplot(tenFoldDF, aes(index, mse)) + geom_line() + geom_point() + labs(title = "MSE vs number of knots", 
    x = "Number of knots", y = "MSE")

We see the minimum is at 8, but that 4 is almost as low so we will try that.

# Refit the df = 4 cubic B-spline of Apps on the full data.
spline4_Apps <- glm(Outstate ~ bs(Apps, df = 4), data = data)
summary(spline4_Apps)
## 
## Call:
## glm(formula = Outstate ~ bs(Apps, df = 4), data = data)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
##  -8298   -3135    -143    2491   11882  
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           7858        705   11.14  < 2e-16 ***
## bs(Apps, df = 4)1     3026        881    3.44  0.00062 ***
## bs(Apps, df = 4)2     1360       2029    0.67  0.50308    
## bs(Apps, df = 4)3     8809       6435    1.37  0.17140    
## bs(Apps, df = 4)4     -339       4042   -0.08  0.93314    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 15934896)
## 
##     Null deviance: 1.2559e+10  on 776  degrees of freedom
## Residual deviance: 1.2302e+10  on 772  degrees of freedom
## AIC: 15098
## 
## Number of Fisher Scoring iterations: 2

Still not that good, lets see the plot

# Residuals vs. fitted values for the Apps spline model.
expendDF <- add_predictions(data, spline4_Apps)
expendDF <- add_residuals(expendDF, spline4_Apps)

# Labels fixed: this plots the Apps spline model — the original title and
# x-label were copy-pasted from the Expend polynomial plot.
ggplot(expendDF, aes(x = pred, y = resid)) + geom_smooth() + geom_point() + 
    labs(title = "Cubic spline model regression for Apps", x = "Predicted out-of-state tuition", 
        y = "Residuals")

Looks like this is still a bad fit and in fact using any kind of polynomial based fit is not good for this data and it likely has very little predictive power regardless.

GAM College

# Reload the college data, take a 70/30 training split, and fit the
# six-predictor linear baseline on the training set.
targetFile <- "data/college.csv"
data <- read.csv(targetFile)

data_split <- resample_partition(data, c(test = 0.3, train = 0.7))
lin_college <- lm(Outstate ~ Private + Room.Board + PhD + perc.alumni + Expend + 
    Grad.Rate, data = data_split$train)
summary(lin_college)
## 
## Call:
## lm(formula = Outstate ~ Private + Room.Board + PhD + perc.alumni + 
##     Expend + Grad.Rate, data = data_split$train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -7851  -1298   -102   1282  10644 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3812.153    523.671   -7.28  1.2e-12 ***
## PrivateYes   2736.038    254.142   10.77  < 2e-16 ***
## Room.Board      1.066      0.104   10.20  < 2e-16 ***
## PhD            40.068      6.694    5.99  3.9e-09 ***
## perc.alumni    48.387      9.004    5.37  1.2e-07 ***
## Expend          0.204      0.020   10.20  < 2e-16 ***
## Grad.Rate      25.980      6.715    3.87  0.00012 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2070 on 537 degrees of freedom
## Multiple R-squared:  0.751,  Adjusted R-squared:  0.749 
## F-statistic:  271 on 6 and 537 DF,  p-value: <2e-16

The summary shows that all the coefficients are significant and that the \(R^2\) indicates \(75\%\) of the variance is accounted for. We can look at the residuals plot to see where the remaining \(25\%\) may be coming from.

# Residuals vs. predictions for the linear baseline (over the full data).
fullDF <- add_predictions(data, lin_college)
fullDF <- add_residuals(fullDF, lin_college)

ggplot(fullDF, aes(x = pred, y = resid)) + geom_smooth() + geom_point() + labs(title = "Residuals vs predicted values for the full linear fit", 
    x = "Predicted expenditure", y = "Residuals")

The error appears to be largest for large predicted values; let's see if a GAM can improve this. We will use a sum of loess fits for our continuous variables, since a loess fit is smooth and should work with most distributions.

# GAM: loess smooths for the continuous predictors, Private as a parametric
# factor term, fitted on the same training split as the linear baseline.
college_gam <- gam(Outstate ~ lo(perc.alumni) + lo(PhD) + lo(Expend) + lo(Grad.Rate) + 
    lo(Room.Board) + Private, data = data_split$train)
summary(college_gam)
## 
## Call: gam(formula = Outstate ~ lo(perc.alumni) + lo(PhD) + lo(Expend) + 
##     lo(Grad.Rate) + lo(Room.Board) + Private, data = data_split$train)
## Deviance Residuals:
##     Min      1Q  Median      3Q     Max 
## -6904.3 -1138.8    33.4  1189.9  7157.0 
## 
## (Dispersion Parameter for gaussian family taken to be 3361360)
## 
##     Null Deviance: 9.22e+09 on 543 degrees of freedom
## Residual Deviance: 1.76e+09 on 522 degrees of freedom
## AIC: 9742 
## 
## Number of Local Scoring Iterations: 2 
## 
## Anova for Parametric Effects
##                  Df   Sum Sq  Mean Sq F value Pr(>F)    
## lo(perc.alumni)   1 2.32e+09 2.32e+09     689 <2e-16 ***
## lo(PhD)           1 5.23e+08 5.23e+08     156 <2e-16 ***
## lo(Expend)        1 1.96e+09 1.96e+09     583 <2e-16 ***
## lo(Grad.Rate)     1 3.79e+08 3.79e+08     113 <2e-16 ***
## lo(Room.Board)    1 4.88e+08 4.88e+08     145 <2e-16 ***
## Private           1 3.88e+08 3.88e+08     115 <2e-16 ***
## Residuals       522 1.76e+09 3.36e+06                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                 Npar Df Npar F  Pr(F)    
## (Intercept)                              
## lo(perc.alumni)     2.4   4.22 0.0108 *  
## lo(PhD)             2.7   2.70 0.0507 .  
## lo(Expend)          4.1  29.90 <2e-16 ***
## lo(Grad.Rate)       2.7   2.35 0.0787 .  
## lo(Room.Board)      2.9   4.16 0.0069 ** 
## Private                                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Even better \(R^2\) values lets look at the residuals plot

# Residuals vs. predictions for the GAM (over the full data).
gamDF <- add_predictions(data, college_gam)
gamDF <- add_residuals(gamDF, college_gam)

# Title fixed: this plots the GAM's residuals, not the linear fit —
# the original title was copy-pasted from the linear-baseline plot.
ggplot(gamDF, aes(x = pred, y = resid)) + geom_smooth() + geom_point() + labs(title = "Residuals vs predicted values for the GAM fit", 
    x = "Predicted expenditure", y = "Residuals")

Much less edge effect, it looks like the GAM model has much lower error. Lets look at a few of the individual components:

# Extract each term's fitted values and standard errors from the GAM.
college_gam_terms <- preplot(college_gam, se = TRUE, rug = FALSE)

# Partial effect of perc.alumni with a 95% pointwise band (+/- 1.96 SE).
data_frame(x = college_gam_terms$`lo(perc.alumni)`$x, y = college_gam_terms$`lo(perc.alumni)`$y, 
    se.fit = college_gam_terms$`lo(perc.alumni)`$se.y) %>% mutate(y_low = y - 
    1.96 * se.fit, y_high = y + 1.96 * se.fit) %>% ggplot(aes(x, y)) + geom_line() + 
    geom_line(aes(y = y_low), linetype = 2) + geom_line(aes(y = y_high), linetype = 2) + 
    labs(title = "GAM of out-of-state tuition", x = "Percentage of donating alumni", 
        y = expression(f[1](perc.alumni)))

We can see that higher rates of alumni donations tend to increase out-of-state tuition.

# Partial effect of PhD with a 95% pointwise band (+/- 1.96 SE).
# Label fixed: PhD is the second smooth term in the model, so f[2], not f[1].
data_frame(x = college_gam_terms$`lo(PhD)`$x, y = college_gam_terms$`lo(PhD)`$y, 
    se.fit = college_gam_terms$`lo(PhD)`$se.y) %>% mutate(y_low = y - 1.96 * 
    se.fit, y_high = y + 1.96 * se.fit) %>% ggplot(aes(x, y)) + geom_line() + 
    geom_line(aes(y = y_low), linetype = 2) + geom_line(aes(y = y_high), linetype = 2) + 
    labs(title = "GAM of out-of-state tuition", x = "Percentage of faculty with PhDs", 
        y = expression(f[2](PhD)))

We can see that higher rates of faculty with PhDs tend to increase out-of-state tuition in general, but there is a local maximum around \(30\%\), even when extended to a \(95\%\) confidence interval.

# Effect of the Private factor term with 95% error bars.
# Label fixed: the term is Private (sixth term in the formula), not "gender" —
# the original y-label appears to be copy-pasted from another analysis.
data_frame(x = college_gam_terms$Private$x, y = college_gam_terms$Private$y, 
    se.fit = college_gam_terms$Private$se.y) %>% unique %>% mutate(y_low = y - 
    1.96 * se.fit, y_high = y + 1.96 * se.fit, x = factor(x, levels = c("No", 
    "Yes"), labels = c("Public", "Private"))) %>% ggplot(aes(x, y, ymin = y_low, 
    ymax = y_high)) + geom_errorbar() + geom_point() + labs(title = "GAM of out-of-state tuition", 
    x = NULL, y = expression(f[6](Private)))

We can see that being private has a statistically significant and large effect on out-of-state tuition.

# Held-out MSE comparison: linear baseline vs. GAM on the same test split.
lin_mse <- mse(lin_college, data_split$test)
gam_mse <- mse(college_gam, data_split$test)

The MSE for the linear model is \(3.848 \times 10^6\), while the GAM's MSE (`gam_mse`) is only about \(0.3\%\) lower. This is not that large of a difference: going to the much more complicated GAM yields only a small improvement. This is likely due to the GAM being overfitted; using polynomials or other less complicated components in the GAM instead of loess fits could likely improve the MSE by reducing the level of overfitting.

summary(college_gam)
## 
## Call: gam(formula = Outstate ~ lo(perc.alumni) + lo(PhD) + lo(Expend) + 
##     lo(Grad.Rate) + lo(Room.Board) + Private, data = data_split$train)
## Deviance Residuals:
##     Min      1Q  Median      3Q     Max 
## -6904.3 -1138.8    33.4  1189.9  7157.0 
## 
## (Dispersion Parameter for gaussian family taken to be 3361360)
## 
##     Null Deviance: 9.22e+09 on 543 degrees of freedom
## Residual Deviance: 1.76e+09 on 522 degrees of freedom
## AIC: 9742 
## 
## Number of Local Scoring Iterations: 2 
## 
## Anova for Parametric Effects
##                  Df   Sum Sq  Mean Sq F value Pr(>F)    
## lo(perc.alumni)   1 2.32e+09 2.32e+09     689 <2e-16 ***
## lo(PhD)           1 5.23e+08 5.23e+08     156 <2e-16 ***
## lo(Expend)        1 1.96e+09 1.96e+09     583 <2e-16 ***
## lo(Grad.Rate)     1 3.79e+08 3.79e+08     113 <2e-16 ***
## lo(Room.Board)    1 4.88e+08 4.88e+08     145 <2e-16 ***
## Private           1 3.88e+08 3.88e+08     115 <2e-16 ***
## Residuals       522 1.76e+09 3.36e+06                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                 Npar Df Npar F  Pr(F)    
## (Intercept)                              
## lo(perc.alumni)     2.4   4.22 0.0108 *  
## lo(PhD)             2.7   2.70 0.0507 .  
## lo(Expend)          4.1  29.90 <2e-16 ***
## lo(Grad.Rate)       2.7   2.35 0.0787 .  
## lo(Room.Board)      2.9   4.16 0.0069 ** 
## Private                                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The ANOVA test indicates that Expend is with high likelihood non-linear, while Room.Board is also quite possibly non-linear. Let's look at the plot of Expend.

# Partial effect of Expend with a 95% pointwise band (+/- 1.96 SE).
# Label fixed: Expend is the third smooth term in the model, so f[3], not f[1].
data_frame(x = college_gam_terms$`lo(Expend)`$x, y = college_gam_terms$`lo(Expend)`$y, 
    se.fit = college_gam_terms$`lo(Expend)`$se.y) %>% mutate(y_low = y - 1.96 * 
    se.fit, y_high = y + 1.96 * se.fit) %>% ggplot(aes(x, y)) + geom_line() + 
    geom_line(aes(y = y_low), linetype = 2) + geom_line(aes(y = y_high), linetype = 2) + 
    labs(title = "GAM of out-of-state tuition", x = "Expenditure per student", 
        y = expression(f[3](Expend)))

As you can see from the plot, the effect of expenditure per student is non-linear; we also saw this affecting our ability to use it in part 2.